In [1]:
import os
import conda

conda_file_dir = conda.__file__
conda_dir = conda_file_dir.split('lib')[0]
proj_lib = os.path.join(os.path.join(conda_dir, 'share'), 'proj')
os.environ["PROJ_LIB"] = proj_lib

from mpl_toolkits.basemap import Basemap
In [2]:
# package imports
#basics
import numpy as np
import pandas as pd

#misc
import gc
import time
import warnings


#viz
import matplotlib.pyplot as plt
import seaborn as sns 
import matplotlib.gridspec as gridspec 
import matplotlib.gridspec as gridspec 

# graph viz
import plotly.offline as pyo
from plotly.graph_objs import *
import plotly.graph_objs as go

#map section
import imageio
import folium
import folium.plugins as plugins
from mpl_toolkits.basemap import Basemap



#graph section
import networkx as nx
import heapq  # for getting top n number of things from list,dict


#settings
start_time=time.time()
color = sns.color_palette()
sns.set_style("dark")
warnings.filterwarnings("ignore")
pyo.init_notebook_mode()

%matplotlib inline
import simplejson as json
In [3]:
def readReview(path, business):
    review=[]
    reviewdf = pd.DataFrame.from_records(review)
    with open(path + "review.json", 'rb') as f:
        for l in f:
            data = json.loads(l)
            review.append(data)
            if len(review) > 1000000:
                df = pd.DataFrame.from_records(review)
                df = pd.merge(df, business, on=['business_id'])
                review = []
                reviewdf = pd.concat([reviewdf, df])
        df = pd.DataFrame.from_records(review)
        df = pd.merge(df, business, on=['business_id'])
        review = []
        reviewdf = pd.concat([reviewdf, df])
    return reviewdf
def readBusiness(path):
    business=[]
    for l in open(path + "business.json", 'rb').readlines():
        business.append(json.loads(l))
    businessdf = pd.DataFrame.from_records(business)
    return businessdf
In [4]:
path = "/Users/xiaxun/Downloads/yelp_dataset/"
business = readBusiness(path)
business = business[business.state == "IL"]
business = business.drop(['stars'],axis=1)
review = readReview(path, business)
In [5]:
review.head(3)
Out[5]:
review_id user_id business_id stars useful funny cool text date name ... city state postal_code latitude longitude review_count is_open attributes categories hours
0 XsBgj268rbYJ5ljKj02JxA THirRdo5CwitTPyqV28NyQ E9HN0nA8eRsIBZYkL0H3xA 4.0 0 0 0 I got stuck in Myrtle Beach last week and stay... 2012-11-06 02:46:40 Courtyard by Marriott Champaign ... Champaign IL 61822 40.136355 -88.251516 21 1 {'RestaurantsPriceRange2': '2', 'BusinessAccep... Hotels & Travel, Event Planning & Services, Ho... {'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...
1 Ib6SFU9TD552DQY7SxwRaQ MkIDdcerLigepann4ZPfdg E9HN0nA8eRsIBZYkL0H3xA 1.0 4 3 3 Unfortunately, I have to agree with the other ... 2012-08-17 22:29:16 Courtyard by Marriott Champaign ... Champaign IL 61822 40.136355 -88.251516 21 1 {'RestaurantsPriceRange2': '2', 'BusinessAccep... Hotels & Travel, Event Planning & Services, Ho... {'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...
2 qcTPJy_vcQVUu0mmmbxTTw o6NcaAHsodkb5PU_kQQCvQ E9HN0nA8eRsIBZYkL0H3xA 5.0 0 0 0 Can't go wrong with 24 hour Starbuck's service... 2013-10-04 07:05:30 Courtyard by Marriott Champaign ... Champaign IL 61822 40.136355 -88.251516 21 1 {'RestaurantsPriceRange2': '2', 'BusinessAccep... Hotels & Travel, Event Planning & Services, Ho... {'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...

3 rows × 21 columns

In [6]:
review.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 42371 entries, 0 to 4988
Data columns (total 21 columns):
review_id       42371 non-null object
user_id         42371 non-null object
business_id     42371 non-null object
stars           42371 non-null float64
useful          42371 non-null int64
funny           42371 non-null int64
cool            42371 non-null int64
text            42371 non-null object
date            42371 non-null object
name            42371 non-null object
address         42371 non-null object
city            42371 non-null object
state           42371 non-null object
postal_code     42371 non-null object
latitude        42371 non-null float64
longitude       42371 non-null float64
review_count    42371 non-null int64
is_open         42371 non-null int64
attributes      40599 non-null object
categories      42348 non-null object
hours           37070 non-null object
dtypes: float64(3), int64(5), object(13)
memory usage: 7.1+ MB
In [7]:
review.describe()
Out[7]:
stars useful funny cool latitude longitude review_count is_open
count 42371.000000 42371.000000 42371.000000 42371.000000 42371.000000 42371.000000 42371.000000 42371.000000
mean 3.582804 1.394067 0.405112 0.336409 40.106372 -88.272415 110.266739 0.846050
std 1.475696 4.946094 1.802702 0.971664 0.235679 0.863709 155.414679 0.360905
min 1.000000 0.000000 0.000000 0.000000 33.449413 -112.208011 3.000000 0.000000
25% 2.000000 0.000000 0.000000 0.000000 40.109222 -88.247552 19.000000 1.000000
50% 4.000000 1.000000 0.000000 0.000000 40.112708 -88.242677 50.000000 1.000000
75% 5.000000 2.000000 0.000000 0.000000 40.118337 -88.229538 125.000000 1.000000
max 5.000000 414.000000 100.000000 46.000000 40.319183 -87.831488 790.000000 1.000000
In [8]:
business.describe()
Out[8]:
latitude longitude review_count is_open
count 1932.000000 1932.000000 1932.000000 1932.000000
mean 40.111130 -88.270470 21.232402 0.799689
std 0.216012 0.771552 43.492499 0.400336
min 33.449413 -112.208011 3.000000 0.000000
25% 40.106692 -88.258481 4.000000 1.000000
50% 40.112714 -88.243300 9.000000 1.000000
75% 40.127048 -88.225122 20.000000 1.000000
max 40.319183 -87.831488 790.000000 1.000000
In [9]:
business.head()
Out[9]:
business_id name address city state postal_code latitude longitude review_count is_open attributes categories hours
289 tsXCDIijxbgsh980VgRc9g Federal Companies 401 W Kenyon Rd Champaign IL 61820 40.133919 -88.248628 11 1 {'BusinessAcceptsCreditCards': 'True', 'ByAppo... Movers, Local Services, Self Storage, Home Ser... {'Monday': '0:0-0:0', 'Tuesday': '8:0-17:0', '...
330 fsklFcY47qJIr0mjgobuUg Ford City 300 Carriage Center Ct Champaign IL 61820 40.090444 -88.247430 3 0 None Car Dealers, Auto Repair, Auto Parts & Supplie... {'Monday': '8:0-19:0', 'Tuesday': '8:0-19:0', ...
356 mofOjB6flg-eAWOFbOkHfQ ChinaTown Buffet 713 W Marketview Dr Champaign IL 61822 40.137270 -88.256043 72 1 {'WiFi': 'u'no'', 'RestaurantsReservations': '... Sushi Bars, Buffets, Restaurants, Chinese {'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...
361 P4HqDYI1icascvcwca7iLg Macy's 2000 N Neil St Champaign IL 61820 40.141029 -88.244222 12 1 {'BusinessAcceptsCreditCards': 'True', 'Restau... Men's Clothing, Shopping, Department Stores, F... {'Monday': '10:0-20:0', 'Tuesday': '9:0-22:0',...
368 eezVjNlzIZrXs9GM5O8b2w Vape Vault 723 S Neil St Champaign IL 61820 40.107126 -88.243912 5 1 {'RestaurantsPriceRange2': '2', 'BusinessAccep... Shopping, Vape Shops {'Monday': '0:0-0:0', 'Tuesday': '9:0-21:0', '...
In [10]:
business.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1932 entries, 289 to 192521
Data columns (total 13 columns):
business_id     1932 non-null object
name            1932 non-null object
address         1932 non-null object
city            1932 non-null object
state           1932 non-null object
postal_code     1932 non-null object
latitude        1932 non-null float64
longitude       1932 non-null float64
review_count    1932 non-null int64
is_open         1932 non-null int64
attributes      1671 non-null object
categories      1927 non-null object
hours           1433 non-null object
dtypes: float64(2), int64(2), object(9)
memory usage: 211.3+ KB

Data Cleaning & Management

Cleaning of business data

In [11]:
city = list(business['city'].unique())
city.sort()
city
Out[11]:
['Broadlands',
 'Champaign',
 'Dewey',
 'Fisher',
 'Fithian',
 'Gifford',
 'Homer',
 'Ivesdale',
 'Joliet',
 'Mahomet',
 'Mansfield',
 'Monticello',
 'Ogden',
 'Philo',
 'Rantoul',
 'Saint Joseph',
 'Savoy',
 'Schaumburg',
 'Sidney',
 'St Joseph',
 'St. Joseph',
 'Thomasboro',
 'Tolono',
 'Tuscola',
 'Urbana',
 'Urbana, Illinois',
 'Villa Grove']
In [12]:
# Data cleaning. Divide cities into counties.
business['city'] = business['city'].apply(lambda city : city.lower())
boolcham = business['city'].apply(lambda x: x.startswith('broad') or x.startswith('homer') or x.startswith('ivesdale'))
business.loc[boolcham,'county'] = 'champaign'
boolcham = business['city'].apply(lambda x: x.startswith('dewey') or x.startswith('fisher') or x.startswith('gifford'))
business.loc[boolcham,'county'] = 'champaign'
boolcham = business['city'].apply(lambda x: x.startswith('mahomet') or x.startswith('ogden') or x.startswith('philo'))
business.loc[boolcham,'county'] = 'champaign'
boolcham = business['city'].apply(lambda x: x.startswith('rantoul') or x.startswith('savoy') or x.startswith('sidney'))
business.loc[boolcham,'county'] = 'champaign'
boolcham = business['city'].apply(lambda x: x.startswith('thomasboro') or x.startswith('tolono') or x.startswith('sidney'))
business.loc[boolcham,'county'] = 'champaign'
boolurb = business['city'].apply(lambda x: x.startswith('urbana'))
business.loc[boolurb,'city'] = 'urbana'
business.loc[boolurb,'county'] = 'champaign'
boolstj = business['city'].apply(lambda x: x.endswith('joseph'))
business.loc[boolstj,'city'] = 'st joseph'
business.loc[boolstj,'county'] = 'champaign'
boolcham = business['city'].apply(lambda x: x.startswith('champaign'))
business.loc[boolcham,'county'] = 'champaign'
boolverm = business['city'].apply(lambda x: x.endswith('fithian'))
business.loc[boolverm,'county'] = 'vermilion'
boolpiatt = business['city'].apply(lambda x: x.endswith('mansfield') or x.endswith('monticello'))
business.loc[boolpiatt,'county'] = 'piatt'
boolcook = business['city'].apply(lambda x: x.endswith('schaumburg'))
business.loc[boolcook,'county'] = 'cook'
booldouglas = business['city'].apply(lambda x: x.endswith('tuscola') or x.startswith('villa'))
business.loc[booldouglas,'county'] = 'douglas'
boolwill = business['city'].apply(lambda x: x.endswith('joliet'))
business.loc[boolwill,'county'] = 'will'
In [13]:
business.head()
Out[13]:
business_id name address city state postal_code latitude longitude review_count is_open attributes categories hours county
289 tsXCDIijxbgsh980VgRc9g Federal Companies 401 W Kenyon Rd champaign IL 61820 40.133919 -88.248628 11 1 {'BusinessAcceptsCreditCards': 'True', 'ByAppo... Movers, Local Services, Self Storage, Home Ser... {'Monday': '0:0-0:0', 'Tuesday': '8:0-17:0', '... champaign
330 fsklFcY47qJIr0mjgobuUg Ford City 300 Carriage Center Ct champaign IL 61820 40.090444 -88.247430 3 0 None Car Dealers, Auto Repair, Auto Parts & Supplie... {'Monday': '8:0-19:0', 'Tuesday': '8:0-19:0', ... champaign
356 mofOjB6flg-eAWOFbOkHfQ ChinaTown Buffet 713 W Marketview Dr champaign IL 61822 40.137270 -88.256043 72 1 {'WiFi': 'u'no'', 'RestaurantsReservations': '... Sushi Bars, Buffets, Restaurants, Chinese {'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'... champaign
361 P4HqDYI1icascvcwca7iLg Macy's 2000 N Neil St champaign IL 61820 40.141029 -88.244222 12 1 {'BusinessAcceptsCreditCards': 'True', 'Restau... Men's Clothing, Shopping, Department Stores, F... {'Monday': '10:0-20:0', 'Tuesday': '9:0-22:0',... champaign
368 eezVjNlzIZrXs9GM5O8b2w Vape Vault 723 S Neil St champaign IL 61820 40.107126 -88.243912 5 1 {'RestaurantsPriceRange2': '2', 'BusinessAccep... Shopping, Vape Shops {'Monday': '0:0-0:0', 'Tuesday': '9:0-21:0', '... champaign

management of review data

In [14]:
review['text length'] = review[['text']].applymap(lambda x: len(x.split(" ")))
# transfer text into lower case
review['text1']=review['text'].apply(lambda sen:" ".join(x.lower() for x in sen.split()))
review[['text','text1']].head()
# delete the punctuation in the text
review['text2']=review['text1'].str.replace('[^\w\s]','')
review[['text1','text2']].head()
# delete the stop words in text.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop=stopwords.words('english')
review['text3']=review['text2'].apply(lambda sen:" ".join(x for x in sen.split() if x not in stop))
review[['text2','text3']].head()
# delete scarce words in the text
freq = pd.Series(' '.join(review['text3']).split()).value_counts()[-10:]
freq = list(freq.index)
review['text4']=review['text3'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
review[['text3','text4']].head()
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xiaxun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Out[14]:
text3 text4
0 got stuck myrtle beach last week stayed waitin... got stuck myrtle beach last week stayed waitin...
1 unfortunately agree negative review thats post... unfortunately agree negative review thats post...
2 cant go wrong 24 hour starbucks service stayed... cant go wrong 24 hour starbucks service stayed...
3 expected much received first free internet pra... expected much received first free internet pra...
4 courtyard pretty par newly decorated courtyard... courtyard pretty par newly decorated courtyard...

Spell check

In [15]:
from textblob import TextBlob
review['text4'][:5].apply(lambda x: str(TextBlob(x).correct()))
Out[15]:
0    got stuck marble beach last week stayed waitin...
1    unfortunately agree negative review that poste...
2    can go wrong 24 hour starbucks service stayed ...
3    expected much received first free internet pra...
4    courtyard pretty par newly decorated courtyard...
Name: text4, dtype: object

Lemmatization

In [16]:
import nltk
nltk.download('wordnet')
from textblob import Word
review['text5'] = review['text4'].apply(lambda x:" ".join([Word(word).lemmatize() for word in x.split()]))
review[['text4','text5']]
[nltk_data] Downloading package wordnet to /Users/xiaxun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Out[16]:
text4 text5
0 got stuck myrtle beach last week stayed waitin... got stuck myrtle beach last week stayed waitin...
1 unfortunately agree negative review thats post... unfortunately agree negative review thats post...
2 cant go wrong 24 hour starbucks service stayed... cant go wrong 24 hour starbucks service stayed...
3 expected much received first free internet pra... expected much received first free internet pra...
4 courtyard pretty par newly decorated courtyard... courtyard pretty par newly decorated courtyard...
... ... ...
4984 got fried chicken wings basically combo fried ... got fried chicken wing basically combo fried o...
4985 deepfried chicken fish polish sausage wings fr... deepfried chicken fish polish sausage wing fry...
4986 lovely bb amazingly comfortable beds gracious ... lovely bb amazingly comfortable bed gracious h...
4987 beautiful little house friends wedding quite b... beautiful little house friend wedding quite bi...
4988 town attend family event stayed three nights w... town attend family event stayed three night wo...

42371 rows × 2 columns

EDA

EDA of Review text

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(lowercase=True, analyzer='word', stop_words= 'english')
tfidf_matrix =  tf.fit_transform(review['text5'])
tfidf = dict(zip(tf.get_feature_names(), tf.idf_))
tfidf = pd.DataFrame(columns = ['tfidf']).from_dict(dict(tfidf), orient = 'index')
tfidf.columns = ['tfidf']
In [18]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
text = tfidf.sort_values(by = ['tfidf'], ascending = True).head(50)
text = text.index
text = ' '.join(list(text))
wordcloud = WordCloud()
wordcloud.generate(text)
plt.figure(figsize = (25,25))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
In [19]:
#Use FacetGrid from the seaborn library to create a grid of 5 histograms 
#of text length based off of the star ratings.
#Reference the seaborn documentation for hints on this
sns.set_style('white')
g = sns.FacetGrid(review, col='stars')
g.map(plt.hist,'text length')
Out[19]:
<seaborn.axisgrid.FacetGrid at 0x1a28f3bfd0>
In [20]:
#Create a boxplot of text length for each star category
sns.boxplot(x='stars', y='text length', data=review,palette='rainbow')
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a38f8ad30>
In [21]:
#Create a countplot of the number of occurrences for each type of star rating.
sns.countplot(x='stars',data=review,palette='rainbow')
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a38bb6fd0>
In [22]:
#review = review.drop(['latitude','longitude'],axis=1)
#review_ = review.drop(['review_count','is_open'],axis=1)
stars = review.groupby('stars').mean()
stars
Out[22]:
useful funny cool latitude longitude review_count is_open text length
stars
1.0 2.786239 0.681564 0.236254 40.077449 -88.394974 59.112320 0.849309 139.652161
2.0 1.718132 0.606862 0.232785 40.114056 -88.247508 95.418394 0.808196 137.655945
3.0 1.065437 0.428191 0.290660 40.113471 -88.247053 110.972037 0.805973 119.000761
4.0 0.975552 0.345557 0.382537 40.113446 -88.239987 128.525732 0.818593 103.233590
5.0 1.087118 0.266606 0.391819 40.109932 -88.255314 124.235714 0.883578 89.506288
In [23]:
stars.corr()
Out[23]:
useful funny cool latitude longitude review_count is_open text length
useful 1.000000 0.880888 -0.743272 -0.901912 -0.923335 -0.978056 0.138226 0.772456
funny 0.880888 1.000000 -0.950651 -0.630276 -0.674551 -0.925578 -0.316143 0.979523
cool -0.743272 -0.950651 1.000000 0.449781 0.505782 0.841771 0.425343 -0.978545
latitude -0.901912 -0.630276 0.449781 1.000000 0.997913 0.857553 -0.369564 -0.489742
longitude -0.923335 -0.674551 0.505782 0.997913 1.000000 0.888374 -0.331269 -0.540820
review_count -0.978056 -0.925578 0.841771 0.857553 0.888374 1.000000 -0.010050 -0.850719
is_open 0.138226 -0.316143 0.425343 -0.369564 -0.331269 -0.010050 1.000000 -0.483692
text length 0.772456 0.979523 -0.978545 -0.489742 -0.540820 -0.850719 -0.483692 1.000000
In [24]:
sns.heatmap(stars.corr(), cmap='coolwarm',annot=True)
Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a3a607208>

EDA of Business

review amounts of different counties in IL

In [25]:
sns.countplot(x = 'county', data = business)
Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a3834be80>

Compare business in Champaign, Piatt, Douglas and Cook.(Evanston locates in Cook)

In [26]:
rating_data=review[['latitude','longitude','stars','review_count']]
# Creating a custom column popularity using stars*no_of_reviews
rating_data['popularity']=rating_data['stars']*rating_data['review_count']
In [27]:
f, axes = plt.subplots(2, 2, figsize=(15,7))

#a random point inside Champaign
lat = 40.133919
lon = -88.248628
#some adjustments to get the right pic
lon_min, lon_max = lon-0.1,lon+0.3
lat_min, lat_max = lat-0.2,lat+0.3
#subset for Champaign
ratings_data_champion=rating_data[(rating_data["longitude"]>lon_min) &\
                    (rating_data["longitude"]<lon_max) &\
                    (rating_data["latitude"]>lat_min) &\
                    (rating_data["latitude"]<lat_max)]

#Facet scatter plot
ratings_data_champion.plot(kind='scatter', x='longitude', y='latitude',
                color='yellow', 
                s=.02, alpha=.6, subplots=True, ax=axes[0,0])
axes[0,0].set_title("Champaign")
axes[0,0].set_facecolor('black')

#a random point inside piatt
lat = 40.035074
lon = -88.569312
#some adjustments to get the right pic
lon_min, lon_max = lon-0.1,lon+0.3
lat_min, lat_max = lat-0.2,lat+0.3
#subset for piatt
ratings_data_piatt=rating_data[(rating_data["longitude"]>lon_min) &\
                    (rating_data["longitude"]<lon_max) &\
                    (rating_data["latitude"]>lat_min) &\
                    (rating_data["latitude"]<lat_max)]
#plot piatt
ratings_data_piatt.plot(kind='scatter', x='longitude', y='latitude',
                color='yellow', 
                s=.02, alpha=.6, subplots=True, ax=axes[0,1])
axes[0,1].set_title("Piatt")
axes[0,1].set_facecolor('black')

#a random point inside douglas
lat = 39.821199
lon = -88.246201
#some adjustments to get the right pic
lon_min, lon_max = lon-0.1,lon+0.01
lat_min, lat_max = lat-0.2,lat+0.01
#subset for dougles
ratings_data_dougles=rating_data[(rating_data["longitude"]>lon_min) &\
                    (rating_data["longitude"]<lon_max) &\
                    (rating_data["latitude"]>lat_min) &\
                    (rating_data["latitude"]<lat_max)]
#plot douglas
ratings_data_dougles.plot(kind='scatter', x='longitude', y='latitude',
                color='yellow', 
                s=.02, alpha=.6, subplots=True, ax=axes[1,0])
axes[1,0].set_title("Douglas")
axes[1,0].set_facecolor('black')

#a random point inside cook
lat = 33.639053
lon = -112.208011
#some adjustments to get the right pic
lon_min, lon_max = lon-0.1,lon+0.3
lat_min, lat_max = lat-0.2,lat+0.3
#subset for cook
ratings_data_cook=rating_data[(rating_data["longitude"]>lon_min) &\
                    (rating_data["longitude"]<lon_max) &\
                    (rating_data["latitude"]>lat_min) &\
                    (rating_data["latitude"]<lat_max)]
#plot cook
ratings_data_cook.plot(kind='scatter', x='longitude', y='latitude',
                color='yellow', 
                s=.02, alpha=.6, subplots=True, ax=axes[1,1])
axes[1,1].set_title("Cook")
axes[1,1].set_facecolor('black')


plt.tight_layout(pad=1.5)  
f.show()

Ratings in Champaign:

Lets take a view of how people rated different businesses in Champaign. The following is an interactive Animation, where we use the awesome Folium package to create stunning Leaflet map visuals. Here, in this animation, we are showing the highlighting businesses based on their Star ratings. The intention was to see if there are certain hotspots/concentrations where there are awesome Restaurants. It turns out good and bad businesses are peppered around the city quite evenly.

In [28]:
data=[]
#rearranging data to suit the format needed for folium
stars_list=list(rating_data['stars'].unique())
for star in stars_list:
    subset=ratings_data_champion[ratings_data_champion['stars']==star]
    data.append(subset[['latitude','longitude']].values.tolist())
#initialize at champaign
lat = 40.133919
lon = -88.248628
zoom_start=11
print("                     Champaign Review heatmap Animation ")

# basic map
m = folium.Map(location=[lat, lon], tiles="OpenStreetMap", zoom_start=zoom_start)
#inprovising the Heatmapwith time plugin to show variations across star ratings 
hm = plugins.HeatMapWithTime(data,max_opacity=0.3,auto_play=True,display_index=True,radius=20)
hm.add_to(m)
m
                     Champaign Review heatmap Animation 
Out[28]:
In [29]:
x=business.categories.value_counts()
print("In IL, there are ",len(x)," different types/categories of Businesses in Yelp!")
In IL, there are  1504  different types/categories of Businesses in Yelp!
In [30]:
x=x.sort_values(ascending=False)
x=x.iloc[0:20]
plt.figure(figsize=(16,4))
ax = sns.barplot(x.index, x.values, alpha=0.5)#,color=color[5])
plt.title("What are the top categories?",fontsize=25)
locs, labels = plt.xticks()
plt.setp(labels, rotation=80)
plt.ylabel('# businesses', fontsize=12)
plt.xlabel('Category', fontsize=12)

#adding the text labels
rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')

plt.show()
In [46]:
business.name.value_counts().index[:10].tolist()
Out[46]:
['Subway',
 "McDonald's",
 "Monical's Pizza",
 "Jimmy John's",
 'Walgreens',
 'Starbucks',
 "Arby's",
 'Dairy Queen',
 "Domino's Pizza",
 'Espresso Royale']
In [55]:
#Plotting top 25 most reviewed businesses among all categories
ax = sns.catplot(x="review_count", y="name",data= business.nlargest(10,'review_count'), 
                 kind="bar",hue= "categories", dodge= False, height= 10 )

plt.subplots_adjust(top=0.9)
ax.fig.suptitle('Top 10 Most Reviewed Businesses And Categories Lables Used') # can also get the figure from plt.gcf()
Out[55]:
Text(0.5, 0.98, 'Top 10 Most Reviewed Businesses And Categories Lables Used')
In [57]:
business['Num_Keywords'] = business['categories'].str.len()

#Top 20 categories with most keyword
business[['categories','Num_Keywords']].sort_values('Num_Keywords',ascending = False).head(10)
Out[57]:
categories Num_Keywords
57025 Flight Instruction, Hot Air Balloons, Specialt... 250.0
83490 Shades & Blinds, Door Sales/Installation, Awni... 250.0
123898 Specialty Schools, Dance Clubs, Dance Schools,... 249.0
159784 Department Stores, Building Supplies, Books, M... 242.0
27936 Home Services, Professional Services, Home & G... 236.0
63391 Professional Services, Damage Restoration, Sho... 231.0
39635 Bars, Arts & Entertainment, Bowling, Event Pla... 229.0
140324 Tiling, Auto Detailing, Carpeting, Home Servic... 227.0
148710 Professional Services, Arts & Crafts, Local Se... 224.0
84407 Pool Halls, Furniture Stores, Toy Stores, Spor... 216.0

We can see that some businesses are using as much as 37 keywords in their categories. Let's look at some of these businesses.

How about the overall distribution of the number of keywords used by businesses? Let's discern that information.

In [59]:
fig = plt.figure()
ax = fig.add_subplot(111)

x = business['Num_Keywords']
numBins = 100
ax.hist(x,numBins,color='green',alpha=0.7)
plt.show()

NLP

NLP Classification Task

In [31]:
review_class = review
X_ = review_class['text']
y_ = review_class['stars']
X = X_
y = y_
index_1 = y_==1
y_tmp1 = y_[index_1]
x_tmp1 = X_[index_1]
index_2 = y_==2
y_tmp2 = y_[index_2]
x_tmp2 = X_[index_2]
index_3 = y_==3
y_tmp3 = y_[index_3]
x_tmp3 = X_[index_3]
index_4 = y_==4
y_tmp4= y_[index_4]
x_tmp4 = X_[index_4]
X = pd.concat([X_,x_tmp1,x_tmp2,x_tmp2,x_tmp2,x_tmp3,x_tmp3,x_tmp4])
y = pd.concat([y_,y_tmp1,y_tmp2,y_tmp2,y_tmp2,y_tmp3,y_tmp3, y_tmp4])
In [32]:
#Import CountVectorizer and create a CountVectorizer object
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(X)
In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train,y_train)
predictions = nb.predict(X_test)
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))
[[2742  911  299   92   34]
 [ 891 2839  904  356   49]
 [ 367  527 2656 1024  163]
 [ 226  267  586 3947  820]
 [ 365   95  149 1737 2558]]


              precision    recall  f1-score   support

         1.0       0.60      0.67      0.63      4078
         2.0       0.61      0.56      0.59      5039
         3.0       0.58      0.56      0.57      4737
         4.0       0.55      0.68      0.61      5846
         5.0       0.71      0.52      0.60      4904

    accuracy                           0.60     24604
   macro avg       0.61      0.60      0.60     24604
weighted avg       0.61      0.60      0.60     24604

Using Text Processing

In [34]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('bow', TfidfVectorizer(lowercase=True, analyzer='word', stop_words= 'english')),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])
In [35]:
X_ = review_class['text5']
y_ = review_class['stars']
X = X_
y = y_
index_1 = y_==1
y_tmp1 = y_[index_1]
x_tmp1 = X_[index_1]
index_2 = y_==2
y_tmp2 = y_[index_2]
x_tmp2 = X_[index_2]
index_3 = y_==3
y_tmp3 = y_[index_3]
x_tmp3 = X_[index_3]
index_4 = y_==4
y_tmp4= y_[index_4]
x_tmp4 = X_[index_4]
X = pd.concat([X_,x_tmp1,x_tmp2,x_tmp2,x_tmp2,x_tmp3,x_tmp3,x_tmp4])
y = pd.concat([y_,y_tmp1,y_tmp2,y_tmp2,y_tmp2,y_tmp3,y_tmp3, y_tmp4])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
pipeline.fit(X_train,y_train)
predictions = pipeline.predict(X_test)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
[[2456 1231  140  217   34]
 [ 435 3896  160  527   21]
 [ 222  644 2235 1548   88]
 [ 152  266  168 4925  335]
 [ 238  149   98 2799 1620]]
              precision    recall  f1-score   support

         1.0       0.70      0.60      0.65      4078
         2.0       0.63      0.77      0.69      5039
         3.0       0.80      0.47      0.59      4737
         4.0       0.49      0.84      0.62      5846
         5.0       0.77      0.33      0.46      4904

    accuracy                           0.62     24604
   macro avg       0.68      0.60      0.60     24604
weighted avg       0.67      0.62      0.60     24604

In [ ]: